# Import required R libraries
#library(AppliedPredictiveModeling)
library(caret)
library(tidyverse)
#library(pls)
#library(elasticnet)
#library(corrplot)
library(readxl)
library(writexl)
library(naniar)
library(corrplot)

This is a role-playing exercise. I am your new boss. I am in charge of production at ABC Beverage, and you are a team of data scientists reporting to me. My leadership has told me that new regulations require us to understand our manufacturing process and its predictive factors, and to be able to report our predictive model of pH to them.

Please use the historical data set I am providing. Build and report the factors in BOTH a technical report and a non-technical report. I like to use Word and Excel. Please provide your non-technical report as a business-friendly, readable document and your predictions in an Excel-readable format. The technical report should clearly show the models you tested and how you selected your final approach.

Please submit both RPubs links and .rmd files (or other readable formats) for the technical and non-technical reports. Also submit the Excel file showing your models' predictions for pH.

# Load the historical beverage data from the Excel workbook
bev_data_raw <- read_excel(path = "data/StudentData.xlsx")

# Dimensions: 2571 observations (rows) by 33 columns
dim(bev_data_raw)
## [1] 2571   33
# Column-by-column structure: type and leading values of each variable
str(object = bev_data_raw)
## tibble [2,571 × 33] (S3: tbl_df/tbl/data.frame)
##  $ Brand Code       : chr [1:2571] "B" "A" "B" "A" ...
##  $ Carb Volume      : num [1:2571] 5.34 5.43 5.29 5.44 5.49 ...
##  $ Fill Ounces      : num [1:2571] 24 24 24.1 24 24.3 ...
##  $ PC Volume        : num [1:2571] 0.263 0.239 0.263 0.293 0.111 ...
##  $ Carb Pressure    : num [1:2571] 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
##  $ Carb Temp        : num [1:2571] 141 140 145 133 137 ...
##  $ PSC              : num [1:2571] 0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
##  $ PSC Fill         : num [1:2571] 0.26 0.22 0.34 0.42 0.16 ...
##  $ PSC CO2          : num [1:2571] 0.04 0.04 0.16 0.04 0.12 ...
##  $ Mnf Flow         : num [1:2571] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb Pressure1   : num [1:2571] 119 122 120 115 118 ...
##  $ Fill Pressure    : num [1:2571] 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
##  $ Hyd Pressure1    : num [1:2571] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure2    : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure3    : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure4    : num [1:2571] 118 106 82 92 92 116 124 132 90 108 ...
##  $ Filler Level     : num [1:2571] 121 119 120 118 119 ...
##  $ Filler Speed     : num [1:2571] 4002 3986 4020 4012 4010 ...
##  $ Temperature      : num [1:2571] 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
##  $ Usage cont       : num [1:2571] 16.2 19.9 17.8 17.4 17.7 ...
##  $ Carb Flow        : num [1:2571] 2932 3144 2914 3062 3054 ...
##  $ Density          : num [1:2571] 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
##  $ MFR              : num [1:2571] 725 727 735 731 723 ...
##  $ Balling          : num [1:2571] 1.4 1.5 3.14 3.04 3.04 ...
##  $ Pressure Vacuum  : num [1:2571] -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
##  $ PH               : num [1:2571] 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
##  $ Oxygen Filler    : num [1:2571] 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
##  $ Bowl Setpoint    : num [1:2571] 120 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure Setpoint: num [1:2571] 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
##  $ Air Pressurer    : num [1:2571] 143 143 142 146 146 ...
##  $ Alch Rel         : num [1:2571] 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
##  $ Carb Rel         : num [1:2571] 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
##  $ Balling Lvl      : num [1:2571] 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
# Column types: `Brand Code` is the only character column;
# the remaining 32 columns are numeric.
# PH is the response (target) variable.

# Per-column summary statistics, including NA counts for numeric columns
summary(object = bev_data_raw)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air Pressurer      Alch Rel        Carb Rel      Balling Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1
# Visualise where values are missing across all columns
vis_miss(bev_data_raw)

# Near-zero-variance columns: nearZeroVar() returns their column positions
nzv_cols <- nearZeroVar(bev_data_raw)
length(nzv_cols)
## [1] 1
nzv_cols
## [1] 13
# Only column 13 (Hyd Pressure1) is flagged.
# Preview the flagged column(s) via nzv_cols instead of a hard-coded 13, so
# this stays in sync with nearZeroVar() if the input data ever changes.
bev_data_raw[nzv_cols]
## # A tibble: 2,571 × 1
##    `Hyd Pressure1`
##              <dbl>
##  1               0
##  2               0
##  3               0
##  4               0
##  5               0
##  6               0
##  7               0
##  8               0
##  9               0
## 10               0
## # … with 2,561 more rows
# Correlation matrix of the numeric columns (predictors and PH).
# The character column is removed BEFORE incomplete rows are dropped: calling
# drop_na() first would also discard rows whose only missing value is in
# `Brand Code`, a column that never enters the correlation.
corr <- bev_data_raw %>%
  select(-`Brand Code`) %>%
  drop_na() %>%
  cor()

# Show each pairwise correlation as a number
corrplot(corr, method = "number")

# Currently unreadable below
# Scatter plots (points plus a smoother) of each numeric predictor against
# the response variable PH.
# `cols` holds the predictor names: every column except `Brand Code` and PH.
cols <- bev_data_raw %>%
  select(-all_of(c("Brand Code", "PH"))) %>%
  names()

# Alternative plot types, kept for reference:
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "ellipse") 
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "strip", jitter = TRUE)
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "box") 
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "pairs") 
featurePlot(
  x = bev_data_raw[, cols],
  y = bev_data_raw$PH,
  plot = "scatter",
  type = c("p", "smooth"),
  span = 0.5,
  layout = c(1, 1)
)

#30#

# Shapiro-Wilk normality test for every predictor column (MARGIN = 2 = columns).
# apply() coerces the tibble to a matrix first; every column in `cols` is
# numeric, so the coercion does not alter the values being tested.
apply(X = bev_data_raw[, cols], MARGIN = 2, FUN = shapiro.test)
## $`Carb Volume`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.96797, p-value < 2.2e-16
## 
## 
## $`Fill Ounces`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.99317, p-value = 1.622e-09
## 
## 
## $`PC Volume`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.98309, p-value < 2.2e-16
## 
## 
## $`Carb Pressure`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.99681, p-value = 3.582e-05
## 
## 
## $`Carb Temp`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.99469, p-value = 6.316e-08
## 
## 
## $PSC
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.95337, p-value < 2.2e-16
## 
## 
## $`PSC Fill`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.9407, p-value < 2.2e-16
## 
## 
## $`PSC CO2`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.83089, p-value < 2.2e-16
## 
## 
## $`Mnf Flow`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.74864, p-value < 2.2e-16
## 
## 
## $`Carb Pressure1`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.99065, p-value = 8.597e-12
## 
## 
## $`Fill Pressure`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.91452, p-value < 2.2e-16
## 
## 
## $`Hyd Pressure1`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.88303, p-value < 2.2e-16
## 
## 
## $`Hyd Pressure2`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.80674, p-value < 2.2e-16
## 
## 
## $`Hyd Pressure3`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.81729, p-value < 2.2e-16
## 
## 
## $`Hyd Pressure4`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.95761, p-value < 2.2e-16
## 
## 
## $`Filler Level`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.86234, p-value < 2.2e-16
## 
## 
## $`Filler Speed`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.43993, p-value < 2.2e-16
## 
## 
## $Temperature
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.82493, p-value < 2.2e-16
## 
## 
## $`Usage cont`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.87732, p-value < 2.2e-16
## 
## 
## $`Carb Flow`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.75487, p-value < 2.2e-16
## 
## 
## $Density
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.83495, p-value < 2.2e-16
## 
## 
## $MFR
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.44664, p-value < 2.2e-16
## 
## 
## $Balling
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.79293, p-value < 2.2e-16
## 
## 
## $`Pressure Vacuum`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.96387, p-value < 2.2e-16
## 
## 
## $`Oxygen Filler`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.75741, p-value < 2.2e-16
## 
## 
## $`Bowl Setpoint`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.80247, p-value < 2.2e-16
## 
## 
## $`Pressure Setpoint`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.73661, p-value < 2.2e-16
## 
## 
## $`Air Pressurer`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.70652, p-value < 2.2e-16
## 
## 
## $`Alch Rel`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.719, p-value < 2.2e-16
## 
## 
## $`Carb Rel`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.94542, p-value < 2.2e-16
## 
## 
## $`Balling Lvl`
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.72188, p-value < 2.2e-16
# Number of observations recorded for each brand code
ggplot(bev_data_raw, aes(x = `Brand Code`)) +
  geom_bar()

# Distribution of PH within each brand code
ggplot(bev_data_raw) +
  geom_boxplot(aes(x = `Brand Code`, y = PH))